# DATATHON — dynamic supply chain logistics analysis

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
✔ broom        1.0.10     ✔ rsample      1.3.1 
✔ dials        1.4.2      ✔ tailor       0.1.0 
✔ infer        1.0.9      ✔ tune         2.0.0 
✔ modeldata    1.5.1      ✔ workflows    1.3.0 
✔ parsnip      1.3.3      ✔ workflowsets 1.1.1 
✔ recipes      1.3.1      ✔ yardstick    1.3.2 
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
library(ggplot2)
library(tidyr)
library(lubridate)
library(zoo) 

Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
library(RcppRoll) 
library(prophet)
Loading required package: Rcpp

Attaching package: 'Rcpp'

The following object is masked from 'package:rsample':

    populate

Loading required package: rlang

Attaching package: 'rlang'

The following objects are masked from 'package:purrr':

    %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
    flatten_raw, invoke, splice
library(corrplot) 
corrplot 0.95 loaded
library(anomalize) 
library(tsibble)
Registered S3 method overwritten by 'tsibble':
  method               from 
  as_tibble.grouped_df dplyr

Attaching package: 'tsibble'

The following object is masked from 'package:zoo':

    index

The following object is masked from 'package:lubridate':

    interval

The following objects are masked from 'package:base':

    intersect, setdiff, union
library(xgboost) 

Attaching package: 'xgboost'

The following object is masked from 'package:dplyr':

    slice
library(ranger) 
library(cluster)
library(factoextra)
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(rsample)
library(fable)
Loading required package: fabletools

Attaching package: 'fabletools'

The following object is masked from 'package:yardstick':

    accuracy

The following object is masked from 'package:parsnip':

    null_model

The following objects are masked from 'package:infer':

    generate, hypothesize
library(dplyr)

# Load the raw shipment-level dataset and aggregate it to one row per day.
data <- read.csv("~/Downloads/dynamic_supply_chain_logistics_dataset.csv")

data <- data %>%
  mutate(
    timestamp = as.POSIXct(timestamp, tz = "UTC"),
    date      = as_date(timestamp)
  ) %>%
  group_by(date) %>%
  summarise(
    # SUM for flow-type quantities (daily totals)
    historical_demand = sum(historical_demand, na.rm = TRUE),
    shipping_costs    = sum(shipping_costs, na.rm = TRUE),

    # MEAN for continuous operational metrics
    fuel_consumption_rate    = mean(fuel_consumption_rate, na.rm = TRUE),
    eta_variation_hours      = mean(eta_variation_hours, na.rm = TRUE),
    traffic_congestion_level = mean(traffic_congestion_level, na.rm = TRUE),
    warehouse_inventory_level= mean(warehouse_inventory_level, na.rm = TRUE),
    lead_time_days           = mean(lead_time_days, na.rm = TRUE),
    delivery_time_deviation  = mean(delivery_time_deviation, na.rm = TRUE),
    delay_probability        = mean(delay_probability, na.rm = TRUE),
    loading_unloading_time   = mean(loading_unloading_time, na.rm = TRUE),
    customs_clearance_time   = mean(customs_clearance_time, na.rm = TRUE),
    driver_behavior_score    = mean(driver_behavior_score, na.rm = TRUE),
    fatigue_monitoring_score = mean(fatigue_monitoring_score, na.rm = TRUE),

    # Share of shipments flagged "High Risk" that day (mean of a logical)
    high_risk_fraction = mean(risk_classification == "High Risk", na.rm = TRUE),

    # MODE (most frequent level) for the categorical status columns
    order_fulfillment_status      = names(sort(table(order_fulfillment_status),decreasing=TRUE))[1],
    cargo_condition_status        = names(sort(table(cargo_condition_status),decreasing=TRUE))[1],
    handling_equipment_availability = names(sort(table(handling_equipment_availability),decreasing=TRUE))[1],
    weather_condition_severity    = names(sort(table(weather_condition_severity),decreasing=TRUE))[1],

    .groups = "drop"
  ) %>%
  # `date` is already of class Date here (it is the group key produced by
  # as_date() above), so the former `mutate(date = as_date(date))` was a
  # no-op and has been removed.
  arrange(date)

# Categorical status columns retained by the daily aggregation (each holds
# the day's most frequent level).
categorical_cols_for_factor <- c("order_fulfillment_status",
                                  "cargo_condition_status",
                                  "handling_equipment_availability",
                                  "weather_condition_severity")

# Convert the categorical columns to factors once, then forward-fill any
# remaining numeric gaps with last observation carried forward (LOCF).
# (The original script defined the column vector and ran the factor
# conversion twice; the duplicate was removed — as.factor on a factor is a
# no-op, so behavior is unchanged.)
data <- data %>%
  mutate(across(all_of(categorical_cols_for_factor), as.factor)) %>%
  mutate(across(where(is.numeric), ~zoo::na.locf(.x, na.rm = FALSE)))


# Defensive cast: high_risk_fraction is already numeric (daily mean of a
# logical), so this is effectively a no-op kept as a safety net.
data <- data %>%
  mutate(high_risk_fraction = as.numeric(high_risk_fraction)) 
# Feature engineering: calendar fields, demand lags, 7-day rolling means,
# and log transforms. drop_na() at the end discards the warm-up rows
# produced by the 30-day lag and the 7-day rolling windows.
df_fe <- data %>%
  mutate(
    # Calendar features (labelled factors for day/month, binary weekend flag;
    # wday() returns 1 = Sunday, 7 = Saturday by default)
    day_of_week   = wday(date, label = TRUE, abbr = FALSE), 
    week_of_year  = week(date),
    month         = month(date, label = TRUE, abbr = FALSE), 
    is_weekend    = factor(ifelse(wday(date) %in% c(1, 7), 1, 0), levels = c(0,1), labels = c("No", "Yes")),

    # Demand lags at 1 day, 1 week and ~1 month
    demand_lag_1   = lag(historical_demand, 1),
    demand_lag_7   = lag(historical_demand, 7),
    demand_lag_30  = lag(historical_demand, 30),
    
    # Right-aligned (trailing) 7-day moving averages via zoo::rollmean
     demand_ma_7   = rollmean(historical_demand, 7, fill = NA, align = "right"),
    fuel_ma_7     = rollmean(fuel_consumption_rate, 7, fill = NA, align = "right"),
    eta_ma_7      = rollmean(eta_variation_hours, 7, fill = NA, align = "right"),
    risk_ma_7     = rollmean(high_risk_fraction, 7, fill = NA, align = "right"),

    # log1p keeps zero-valued days finite
    log_demand    = log1p(historical_demand),
    log_shipping  = log1p(shipping_costs),

    # NOTE(review): these two columns duplicate fuel_ma_7 / eta_ma_7 above
    # (RcppRoll::roll_meanr is also a trailing 7-day mean). Both pairs feed
    # the downstream `~ .` model recipes as predictors, so they are kept
    # here to avoid changing model inputs — consider dropping one pair.
    fuel_consumption_rate_ma_7 = roll_meanr(fuel_consumption_rate, n = 7, fill = NA),
    eta_variation_hours_ma_7   = roll_meanr(eta_variation_hours, n = 7, fill = NA)
  ) %>%
  drop_na() 
# Cluster days into "transport modes" from three operational signals.
transport_features <- df_fe %>%
  select(fuel_consumption_rate, eta_variation_hours, traffic_congestion_level)

transport_scaled <- scale(transport_features)

# The pairwise distance matrix is loop-invariant: compute it once instead of
# rebuilding it (O(n^2) work) inside every iteration of the k search.
transport_dist <- dist(transport_scaled)

# Mean silhouette width for k = 2..6; vapply pins the return type to a
# numeric vector (sapply's return type depends on its input).
sil_scores <- vapply(2:6, function(k) {
  set.seed(123)  # same seed per k so comparisons are not confounded by inits
  km_tmp <- kmeans(transport_scaled, centers = k, nstart = 25)
  mean(silhouette(km_tmp$cluster, transport_dist)[, 3])
}, numeric(1))
best_k <- which.max(sil_scores) + 1  # +1 because the candidate grid starts at k = 2

set.seed(123)
km <- kmeans(transport_scaled, centers = best_k, nstart = 25)

df_fe$transport_mode <- factor(km$cluster, labels = paste0("Mode_", 1:best_k))

# Per-cluster means of the UNSCALED inputs, for interpretation.
cluster_summary <- aggregate(transport_features,
                             by = list(transport_mode = df_fe$transport_mode),
                             FUN = mean)
print(cluster_summary)
  transport_mode fuel_consumption_rate eta_variation_hours
1         Mode_1              7.789716            3.151687
2         Mode_2              9.138842            2.928089
3         Mode_3              7.674467            2.331206
4         Mode_4              7.557288            3.120366
  traffic_congestion_level
1                 5.800782
2                 4.913704
3                 4.983456
4                 4.389308
# Visualize the clusters projected onto the first two principal components.
cluster_plot_title <- paste("K-means Clustering (k =", best_k, ")")
fviz_cluster(
  km,
  data         = transport_scaled,
  geom         = "point",
  ellipse.type = "convex",
  palette      = "jco",
  main         = cluster_plot_title
)

Mode_2 clearly has higher fuel consumption → long-haul or heavy transport.

Mode_3 has low ETA variation → likely light vehicles or predictable routes.

Modes 1 & 4 are in-between, but differ in their congestion/ETA patterns.

# Build composite daily cost measures, then bin each component into three
# equal-width Low/Medium/High bands. mutate() evaluates sequentially, so the
# bins can reference the composites defined just above them.
df_fe <- df_fe %>%
  mutate(
    transport_costs      = fuel_consumption_rate + traffic_congestion_level + driver_behavior_score,
    infrastructure_costs = warehouse_inventory_level + loading_unloading_time,
    risk_costs           = delay_probability + delivery_time_deviation,
    total_cost           = transport_costs + infrastructure_costs + risk_costs,
    transport_costs_bin      = cut(transport_costs, breaks = 3, labels = c("Low", "Medium", "High")),
    infrastructure_costs_bin = cut(infrastructure_costs, breaks = 3, labels = c("Low", "Medium", "High")),
    risk_costs_bin           = cut(risk_costs, breaks = 3, labels = c("Low", "Medium", "High"))
  )

# Average each cost component within every transport mode; `.names` keeps the
# original mean_* column names.
df_fe %>%
  group_by(transport_mode) %>%
  summarise(
    across(
      c(transport_costs, infrastructure_costs, risk_costs, total_cost),
      \(x) mean(x),
      .names = "mean_{.col}"
    )
  )
# A tibble: 4 × 5
  transport_mode mean_transport_costs mean_infrastructure_costs mean_risk_costs
  <fct>                         <dbl>                     <dbl>           <dbl>
1 Mode_1                         14.1                      304.            5.90
2 Mode_2                         14.6                      297.            5.85
3 Mode_3                         13.2                      305.            5.84
4 Mode_4                         12.4                      300.            5.90
# ℹ 1 more variable: mean_total_cost <dbl>
# Daily demand series with a loess trend overlay (span 0.2 = fairly local).
demand_plot <- ggplot(df_fe, aes(x = date, y = historical_demand)) +
  geom_line(alpha = 0.6) +
  geom_smooth(se = FALSE, color = "blue", method = "loess", span = 0.2) +
  labs(y = "Historical Demand",
       title = "Historical Demand Over Time") +
  theme_minimal()
demand_plot
`geom_smooth()` using formula = 'y ~ x'

# Congestion vs ETA: treat the (averaged) congestion level as a discrete
# factor so each level gets its own boxplot.
congestion_plot <- df_fe %>%
  ggplot(aes(x = factor(traffic_congestion_level), y = eta_variation_hours)) +
  geom_boxplot(fill = "lightblue") +
  labs(
    x = "Traffic Congestion Level (Factor)",
    y = "ETA Variation (Hours)",
    title = "ETA Variation by Traffic Congestion Level"
  ) +
  theme_minimal()
congestion_plot

# Shipping cost distribution per weather severity level.
weather_plot <- df_fe %>%
  ggplot(aes(x = weather_condition_severity, y = shipping_costs, fill = weather_condition_severity)) +
  geom_boxplot() +
  labs(
    x = "Weather Condition Severity",
    y = "Shipping Costs",
    title = "Shipping Costs by Weather Severity"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # tilt long labels
weather_plot

# Correlation matrix over the numeric features. week_of_year is excluded:
# it is a calendar index, not a measurement.
numeric_features <- df_fe %>%
  select(where(is.numeric)) %>%
  select(-week_of_year)

# pairwise.complete.obs tolerates scattered NAs per column pair.
M <- cor(numeric_features, use = "pairwise.complete.obs")

corrplot(M, method = "circle", type = "upper", order = "hclust",
         tl.col = "black", tl.srt = 45)

# STL decomposition + IQR-based anomaly detection on daily fuel consumption
# (weekly seasonal frequency, alpha = 0.05 IQR fence).
fuel_anomalies <- df_fe %>%
  as_tsibble(index = date) %>%
  time_decompose(fuel_consumption_rate, method = "stl", frequency = "week") %>%
  anomalize(remainder, method = "iqr", alpha = 0.05) %>%
  time_recompose()

fuel_anomalies %>%
  plot_anomalies(time_recomposed = TRUE) +
  labs(title = "Fuel Consumption Anomalies") +
  theme_minimal()
Converting from tbl_ts to tbl_time.
Auto-index message: index = date
frequency = 7 days
trend = 91 days
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 

# ETA anomalies: same STL + IQR pipeline, applied to ETA variation.
eta_anomalies <- df_fe %>%
  as_tsibble(index = date) %>%
  time_decompose(eta_variation_hours, method = "stl", frequency = "week") %>%
  anomalize(remainder, method = "iqr", alpha = 0.05) %>%
  time_recompose()

eta_anomalies %>%
  plot_anomalies(time_recomposed = TRUE) +
  labs(title = "ETA Variation Anomalies") +
  theme_minimal()
Converting from tbl_ts to tbl_time.
Auto-index message: index = date
frequency = 7 days
trend = 91 days

Decomposed costs into transport, infrastructure, and risk components.

# Chronological 80/20 split. initial_time_split() takes the first rows in
# order, so sorting by date first guarantees the test set is strictly the
# most recent period (no shuffling for time series).
df_fe <- arrange(df_fe, date)

data_split <- initial_time_split(df_fe, prop = 0.8)
train_data <- training(data_split)
test_data  <- testing(data_split)

# Prophet expects exactly two columns: `ds` (date) and `y` (target).
df_prophet_train <- train_data %>%
  transmute(ds = date, y = historical_demand)

m_prophet <- prophet(df_prophet_train)
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# Forecast over the held-out horizon, then inspect the fit and its
# trend/seasonality components.
horizon_days <- nrow(test_data)
future   <- make_future_dataframe(m_prophet, periods = horizon_days, freq = "day")
forecast <- predict(m_prophet, future)

plot(m_prophet, forecast) +
  labs(title = "Prophet Forecast for Historical Demand") +
  theme_minimal()

prophet_plot_components(m_prophet, forecast)

# Rolling-origin CV for prophet: ~1-year initial training window, a new
# cutoff every 30 days, 30-day forecast horizon.
df_cv_prophet <- cross_validation(m_prophet, initial = 365.25, period = 30, horizon = 30, units = "days")
Making 22 forecasts with cutoffs between 2022-02-19 and 2023-11-11
# Error metrics aggregated by forecast horizon (MSE/RMSE/MAE/MAPE/coverage).
prophet_performance <- performance_metrics(df_cv_prophet)
print("Prophet Cross-Validation Metrics:")
[1] "Prophet Cross-Validation Metrics:"
print(head(prophet_performance))
  horizon       mse     rmse      mae       mape      mdape      smape
1  3 days 269628341 16420.36 12764.31 0.08901472 0.07471041 0.08810970
2  4 days 282472343 16806.91 13094.86 0.09358643 0.07444243 0.09141565
3  5 days 333855022 18271.70 14099.88 0.10264370 0.07783679 0.09934604
4  6 days 316277572 17784.19 13852.44 0.09886024 0.08027146 0.09660425
5  7 days 322418433 17956.01 14033.55 0.09973085 0.08482533 0.09758253
6  8 days 339016089 18412.39 14922.99 0.10917429 0.09488640 0.10515899
   coverage
1 0.8030303
2 0.8030303
3 0.7878788
4 0.7727273
5 0.7575758
6 0.7575758
# RMSE as a function of forecast horizon.
plot_cross_validation_metric(df_cv_prophet, metric = 'rmse')

# Rolling-origin resampling for model CV: a sliding (non-cumulative)
# 365-day training window, a 30-day assessment window, and skip = 30 so
# consecutive slices' assessment periods do not overlap.
set.seed(1)

rolling_cv <- rolling_origin(
  df_fe,
  initial    = 365,
  assess     = 30,
  cumulative = FALSE,
  skip       = 30
)

rolling_cv
# Rolling origin forecast resampling 
# A tibble: 30 × 2
   splits           id     
   <list>           <chr>  
 1 <split [365/30]> Slice01
 2 <split [365/30]> Slice02
 3 <split [365/30]> Slice03
 4 <split [365/30]> Slice04
 5 <split [365/30]> Slice05
 6 <split [365/30]> Slice06
 7 <split [365/30]> Slice07
 8 <split [365/30]> Slice08
 9 <split [365/30]> Slice09
10 <split [365/30]> Slice10
# ℹ 20 more rows
# Preprocessing for the delivery-time model:
#  - date becomes an ID (kept in the data but not used as a predictor)
#  - three status factors are dropped; all remaining nominal predictors
#    (e.g. weather, day_of_week, month, is_weekend, transport_mode, *_bin)
#    are integer-encoded
#  - mean imputation, then centering/scaling of numeric predictors.
# Step order matters: step_integer must precede the numeric-only steps so
# the encoded columns are imputed and normalized too.
delivery_recipe <- 
  recipe(delivery_time_deviation ~ ., data = train_data) %>%
  update_role(date, new_role = "ID") %>%  # Keep date as ID
  step_rm(order_fulfillment_status, cargo_condition_status,
          handling_equipment_availability) %>%
  step_integer(all_nominal_predictors()) %>%
  step_impute_mean(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())

# Random forest (ranger engine) with fixed hyperparameters:
#   mtry  = 5    predictors sampled at each split
#   trees = 1000 trees in the forest
#   min_n = 5    minimum node size
# Permutation importance is recorded so variable importance can be read
# off the fitted engine later.
rf_model <- rand_forest(mtry = 5, trees = 1000, min_n = 5) %>%
  set_mode("regression") %>%
  set_engine("ranger", importance = "permutation")

# Fit the preprocessing + model pipeline on the training window.
rf_fit <- workflow() %>%
  add_recipe(delivery_recipe) %>%
  add_model(rf_model) %>%
  fit(data = train_data)

# Score the held-out period, keeping the date and the truth column
# alongside the predictions for plotting and metrics.
rf_predictions <- rf_fit %>%
  predict(new_data = test_data) %>%
  bind_cols(select(test_data, date, delivery_time_deviation))

# Held-out accuracy (yardstick metrics, each a one-row tibble).
mae_rf  <- mae(rf_predictions, truth = delivery_time_deviation, estimate = .pred)
rmse_rf <- rmse(rf_predictions, truth = delivery_time_deviation, estimate = .pred)
rsq_rf  <- rsq(rf_predictions, truth = delivery_time_deviation, estimate = .pred)

# Held-out test metrics for the random forest (console output retained).
print(mae_rf)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 mae     standard       0.162
print(rmse_rf)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.265
print(rsq_rf)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard       0.943
# Predicted vs actual, with the identity line as the perfect-fit reference.
rf_scatter <- ggplot(rf_predictions, aes(x = delivery_time_deviation, y = .pred)) +
  geom_point(alpha = 0.6) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(
    x = "Actual Delivery Time Deviation",
    y = "Predicted Delivery Time Deviation",
    title = "Predicted vs Actual Delivery Time Deviation (Random Forest)",
    subtitle = paste0("MAE: ", round(mae_rf$.estimate, 4), 
                      ", RMSE: ", round(rmse_rf$.estimate, 4))
  ) +
  coord_fixed() +  # equal axis scales so the identity line is at 45 degrees
  theme_minimal()
rf_scatter

# Actual vs predicted traces over the test period.
rf_time_plot <- rf_predictions %>%
  ggplot(aes(x = date)) +
  geom_line(aes(y = delivery_time_deviation, color = "Actual")) +
  geom_line(aes(y = .pred, color = "Predicted"), linetype = "dashed") +
  labs(
    x = "Date",
    y = "Delivery Time Deviation",
    color = "Type",
    title = "Actual vs Predicted Delivery Time Deviation Over Time"
  ) +
  theme_minimal()
rf_time_plot

# Same recipe/model pair as an unfitted workflow, refit on each
# rolling-origin resample to estimate out-of-sample error.
rf_workflow <- workflow() %>%
  add_model(rf_model) %>%
  add_recipe(delivery_recipe)

set.seed(2)
rf_cv_results <- fit_resamples(
  rf_workflow,
  resamples = rolling_cv,
  metrics   = metric_set(rmse, mae, rsq),
  control   = control_resamples(save_pred = TRUE)  # keep per-slice predictions
)

# CV metrics averaged over the 30 rolling slices (console output retained).
print("Random Forest Cross-Validation Results:")
[1] "Random Forest Cross-Validation Results:"
collect_metrics(rf_cv_results)
# A tibble: 3 × 6
  .metric .estimator  mean     n std_err .config        
  <chr>   <chr>      <dbl> <int>   <dbl> <chr>          
1 mae     standard   0.223    30 0.00696 pre0_mod0_post0
2 rmse    standard   0.316    30 0.0128  pre0_mod0_post0
3 rsq     standard   0.936    30 0.00398 pre0_mod0_post0
# Risk model: same chronological 80/20 split, re-derived on a sorted copy.
df_fe_risk <- df_fe %>% arrange(date)

risk_split <- initial_time_split(df_fe_risk, prop = 0.8)
risk_train <- training(risk_split)
risk_test  <- testing(risk_split)

# Log the train/test date boundaries as a sanity check.
message("Risk Train data date range: ", min(risk_train$date), " to ", max(risk_train$date))
Risk Train data date range: 2021-01-31 to 2023-12-11
message("Risk Test data date range: ", min(risk_test$date), " to ", max(risk_test$date))
Risk Test data date range: 2023-12-12 to 2024-08-29
# Preprocessing for the high-risk-fraction model: drop all four status
# factors, integer-encode remaining nominal predictors, impute, normalize.
# NOTE(review): with `~ .` every other column is a predictor — including
# risk_ma_7, a trailing rolling mean of the target itself, which likely
# leaks target information for same-day prediction; confirm intent.
risk_recipe <- recipe(high_risk_fraction ~ ., data = risk_train) %>%
  update_role(date, new_role = "ID") %>%
  step_rm(order_fulfillment_status, cargo_condition_status,
          handling_equipment_availability, weather_condition_severity) %>%
  step_integer(all_nominal_predictors()) %>%
  step_impute_mean(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())

# Gradient-boosted trees (xgboost engine) with fixed hyperparameters:
# 500 rounds, depth 6, learning rate 0.05, minimum node size 10.
xgb_model <- boost_tree(
  trees = 500, tree_depth = 6, learn_rate = 0.05, min_n = 10
) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

# Assemble and fit the risk pipeline on the training window.
xgb_workflow <- workflow() %>%
  add_model(xgb_model) %>%
  add_recipe(risk_recipe)

xgb_fit <- fit(xgb_workflow, data = risk_train)

# Predict on the held-out period and clamp to [0, 1]: the target is a
# fraction, but an unconstrained regressor can stray outside that range.
xgb_predictions <- xgb_fit %>%
  predict(new_data = risk_test) %>%
  bind_cols(select(risk_test, date, high_risk_fraction)) %>%
  mutate(.pred = pmin(1, pmax(0, .pred)))

# Held-out accuracy for the risk model.
mae_xgb <- mae(xgb_predictions, truth = high_risk_fraction, estimate = .pred)
rmse_xgb <- rmse(xgb_predictions, truth = high_risk_fraction, estimate = .pred)
rsq_xgb <- rsq(xgb_predictions, truth = high_risk_fraction, estimate = .pred)

# Held-out test metrics for the XGBoost risk model (output retained).
# NOTE(review): rsq ≈ 0.045 in the transcript — the model explains almost
# none of the variance despite low MAE/RMSE; the target may be near-constant.
print(mae_xgb)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 mae     standard      0.0682
print(rmse_xgb)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard      0.0884
print(rsq_xgb)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard      0.0452
# Predicted vs actual high-risk fraction, identity line as reference.
# Fix: `size` on line geoms was deprecated in ggplot2 3.4.0 in favor of
# `linewidth` (the transcript shows the deprecation warning).
ggplot(xgb_predictions, aes(x = high_risk_fraction, y = .pred)) +
  geom_point(alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed",
              linewidth = 1) +
  labs(
    title = "Predicted vs Actual High Risk Fraction (XGBoost)",
    subtitle = paste0("MAE: ", round(mae_xgb$.estimate, 4),
                      ", RMSE: ", round(rmse_xgb$.estimate, 4)),
    x = "Actual High Risk Fraction",
    y = "Predicted High Risk Fraction"
  ) +
  theme_minimal() +
  coord_fixed()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

# Actual vs predicted high-risk fraction over the test period.
xgb_time_plot <- xgb_predictions %>%
  ggplot(aes(x = date)) +
  geom_line(aes(y = high_risk_fraction, color = "Actual")) +
  geom_line(aes(y = .pred, color = "Predicted"), linetype = "dashed") +
  labs(
    x = "Date",
    y = "High Risk Fraction",
    color = "Type",
    title = "Actual vs Predicted High Risk Fraction Over Time"
  ) +
  theme_minimal()
xgb_time_plot

# Evaluate the risk pipeline on the same rolling-origin resamples used for
# the random forest, so the two models' CV metrics are comparable.
set.seed(3)
xgb_cv_results <- fit_resamples(
  xgb_workflow,
  resamples = rolling_cv,
  metrics   = metric_set(rmse, mae, rsq),
  control   = control_resamples(save_pred = TRUE)  # keep per-slice predictions
)

# CV metrics for the risk model (output retained; rsq stays low under CV too).
print("XGBoost Cross-Validation Results:")
[1] "XGBoost Cross-Validation Results:"
collect_metrics(xgb_cv_results)
# A tibble: 3 × 6
  .metric .estimator   mean     n std_err .config        
  <chr>   <chr>       <dbl> <int>   <dbl> <chr>          
1 mae     standard   0.0708    30 0.00152 pre0_mod0_post0
2 rmse    standard   0.0877    30 0.00186 pre0_mod0_post0
3 rsq     standard   0.0595    30 0.0117  pre0_mod0_post0
# Bottom-level daily cost series: one row per (date, cost_type) pair,
# keyed as a tsibble, then aggregated into a hierarchy where the
# <aggregated> level is the daily sum over the three cost types.
df_bottom <- df_fe %>%
  select(date, transport_costs, infrastructure_costs, risk_costs)

df_bottom_long <- df_bottom %>%
  pivot_longer(
    cols      = c(transport_costs, infrastructure_costs, risk_costs),
    names_to  = "cost_type",
    values_to = "value"
  ) %>%
  as_tsibble(index = date, key = cost_type)

df_hier <- df_bottom_long %>%
  aggregate_key(. ~ cost_type, value = sum(value))

df_hier
# A tsibble: 5,228 x 3 [1D]
# Key:       cost_type [4]
Loading required namespace: crayon
   date       cost_type    value
   <date>     <chr*>       <dbl>
 1 2021-01-31 <aggregated>  223.
 2 2021-02-01 <aggregated>  258.
 3 2021-02-02 <aggregated>  242.
 4 2021-02-03 <aggregated>  393.
 5 2021-02-04 <aggregated>  335.
 6 2021-02-05 <aggregated>  321.
 7 2021-02-06 <aggregated>  271.
 8 2021-02-07 <aggregated>  218.
 9 2021-02-08 <aggregated>  347.
10 2021-02-09 <aggregated>  300.
# ℹ 5,218 more rows